Cross Referencing Endowment Values

endowment_data <- read_rds("./data/endowment_filter_data_990.RDS") 

companies_to_ein <- read_csv("./data/companies.csv") %>%
  mutate(EIN = as.character(ein))
# make kable table with consistent formatting
make_table <- function(df, title = "", ...) {
  title <- paste0("<center><span style = 'font-size:150%;color:black'><b>",
                  title,
                  "</span></b><center>")
   as_tibble(df) %>%
    kbl(caption = title, ... ) %>%
    kable_material() %>%
    row_spec(row=0, background = "#43494C" , color = "white", bold = TRUE)
}
# plot missingness for a given variable 
plot_missing <- function(variable) {
  
  endowment_data %>%
    group_by(EIN) %>%
    # number of observations where variable is not NA 
    summarize(number_observations = sum(!is.na(!!sym(variable)))) %>%
    group_by(number_observations) %>%
    # number of EINs with each value of number_observations
    summarize(n_ein=n()) %>%
    ggplot(aes(x = number_observations, y =n_ein ))+
    geom_bar(stat="identity") +
    labs(y = "Number of Companies",
         x = paste0("Number of Observations where\n",
         variable, " was Not Missing"),
         title =paste0("Missingness for ", variable)) +
    theme_bw() +
    theme(plot.title = element_text(face = "bold", hjust = .5)) 

}




# compare values from CY to CYM* for given variable
# returns data frame that contains the difference between the CY value and 
# corresonding CM* values
# for example, the difference between the CY value for 2016 would be compared 
# to the CYM1 value for 2017, and the CYM2 value for 2018, and so on 
check_variable <- function(variable_name,
                           data) {
  
  
  base_name <- variable_name
  var <- paste0("CY", base_name)
  vars <- paste0("CYM", c( 1:4), base_name)
  
  # plt <- plot_missing(var)
  # print(plt)
  
  eins_with_variable <- data %>%
    group_by(EIN) %>%
    summarize(number_observations = sum(!is.na(!!sym(var)))) %>%
    filter(number_observations != 0) %>%
    pull(EIN)
  
  
  # the goal here is to create a row for each fiscal year, with NAs if 
  # there are no observations for that year
  # this is needed so that we have consecutive years, which is important
  # for substraction using lag() to work correctly

 data <- data %>%
  filter(EIN %in% eins_with_variable) %>%
  select(EIN, fiscal_year, contains(base_name)) %>%
   pivot_wider(names_from = fiscal_year, 
             #  names_prefix = "fiscalyear",
               values_from=contains(base_name)) %>%
   pivot_longer(cols = contains(base_name),
                names_to = "variable_year") %>%
   separate(variable_year, sep = "_", into = c("variable_name", "fiscal_year")) %>%
   pivot_wider(names_from = variable_name, values_from = value) %>%
   mutate(fiscal_year = as.factor(as.numeric(fiscal_year)))
 

  
  crossref <- data %>%
    group_by(EIN) %>%
    arrange(fiscal_year) %>%
    # lag corresponds to how far back the current year comparison should be
    # vars contains the CM* variables that represent reporting for years back
    # compare these CM* variables to the lagged current year (CY) variables
    mutate(
      difference_in_reported_year1 =  !!sym(vars[1]) - 
        lag(!!sym(var), n =1),
      difference_in_reported_year2 =  !!sym(vars[2]) - 
        lag(!!sym(var), n =2),
      difference_in_reported_year3 =  !!sym(vars[3]) - 
        lag(!!sym(var), n =3),
       difference_in_reported_year4 =  !!sym(vars[4]) - 
        lag(!!sym(var), n =4)
      )  %>%
    ungroup()

}

Cross Referencing Beginning Year Balance Amount

Comparison Across Years

As we might expect, we see that a higher proportion had a nonzero difference between the cross referenced reports for years further back in time. That is, reporting tended to be more accurate for most recent years.

crossref <- check_variable("BeginningYearBalanceAmt", data = endowment_data)
plot_missing("CYBeginningYearBalanceAmt")

# plot fraction where there was a difference between 
# the reports by year
crossref %>% 
  select(EIN, contains("difference")) %>%
  pivot_longer(cols = contains("difference")) %>%
  filter(!is.na(value)) %>%
  group_by(name)  %>%
  summarize(number_zeros = sum(ifelse(value == 0, 1,0)),
            total_reports = n(),
            fraction = 1-( number_zeros / total_reports)) %>%
  mutate(name = gsub("difference_in_reported_year", "", name)) %>%
  ggplot(aes(x=name, y = fraction)) +
  geom_bar(stat ="identity", fill = "#234A77") +
  geom_label(aes(label = round(fraction,2))) +
  labs(title = paste0("Fraction of Differences that Were Nonzero\n",
                      "Between Cross Referenced Reports"),
       subtitle = "By Year",
       x = "Years Between Reports Compared",
       y = "Fraction with Nonzero Difference") +
  theme_bw() +
  theme(plot.title = element_text(hjust = .5, face="bold"),
        plot.subtitle = element_text(hjust = .5, face="italic"))

We also see we have fewer total counts of non-missing reports as we go further back in time.

# stacked chart, note we can't see how nonzero counts are changing 
# relative to the total counts
crossref %>%
  select(EIN, contains("difference"), fiscal_year) %>%
  pivot_longer(cols = contains("difference")) %>%
  filter(!is.na(value)) %>%
  group_by(name)  %>%
  summarize(zero = sum(ifelse(value == 0, 1,0)),
            nonzero = sum(ifelse(value == 0, 0,1))) %>%
  # notice each row represents a fiscal_year-EIN-difference_type 
  pivot_longer(cols = c(zero, nonzero),
               names_to = "source",
               values_to = "count") %>%
  mutate(name = gsub("difference_in_reported_year", "", name),
         source = ifelse(source == "nonzero",
                         "Nonzero Difference", 
                         "Zero Difference")) %>%
  ggplot(aes(x=name, y = count, fill = source)) +
  geom_bar(stat ="identity", position = "stack", alpha = .8) +
  geom_label(aes(label = round(count,3), y = count, color = source),
             position = "stack",
             size = 2.6,
             label.padding = unit(.1, "lines"),
             fill = "white",
             fontface="bold",
             show.legend = FALSE) +
  labs(title = "Number of Zero and Nonzero Differences\nBetween Cross Referenced Reports",
       subtitle = "By Year",
       x = "Years Between Reports Compared",
       y = "Count",
       fill = "") +
  theme_bw() +
  theme(plot.title = element_text(size = 16, hjust = .5, face="bold"),
        plot.subtitle = element_text(hjust = .5, face="italic"),
        axis.text.x = element_text(size = 13),
        axis.title = element_text(size = 16, face = "bold"))

Companies with Discordance in Reported Values

# difference represents What They Reported as CY Minus X Years - What They Reported at The Time

companies_different <- crossref %>%
  pivot_longer(cols = contains("difference")) %>%
  select(EIN, fiscal_year, name, value) %>%
  filter(value > 0) %>%
   left_join(companies_to_ein, by = c("EIN" = "EIN")) %>%
  arrange(organization_name) %>%
  pull(EIN) %>%
  unique()
  
crossref %>%
  pivot_longer(cols = contains("difference")) %>%
  select(EIN, fiscal_year, name, value) %>%
  filter(value > 0) %>%
  left_join(companies_to_ein, by = c("EIN" = "EIN")) %>%
  mutate(year = substr(name, nchar(name), nchar(name)),
         year = paste0("Comparing Current<br> Year Minus ",
                       year)) %>%
  arrange(organization_name) %>%
  select(`Organization Name` = organization_name,
         `Difference in Years` = year, 
         `Fiscal Year` = fiscal_year,
         `Recent  - Previously Reported` = value) %>%
  make_table(title = paste0(
    "Comparing Values Reported in More Recent Report to Those Previously Reported:<br>",
    "<i>Number of Companies that have at Least One Report Not Concordant: </i>",
    length(companies_different)),
             digits = 3, 
             format.args = list(
               big.mark = ",",
               scientific = FALSE),
    escape=FALSE,
    booktabs=TRUE)  %>%
  scroll_box(height = "450px",
             width = "100%") 
Comparing Values Reported in More Recent Report to Those Previously Reported:
Number of Companies that have at Least One Report Not Concordant: 5
Organization Name Difference in Years Fiscal Year Recent - Previously Reported
Ballet Arizona Comparing Current
Year Minus 1
2018 4,025,025
Ballet Arizona Comparing Current
Year Minus 2
2018 500,000
Ballet Arizona Comparing Current
Year Minus 2
2019 4,025,025
Ballet Arizona Comparing Current
Year Minus 3
2019 500,000
Ballet Arizona Comparing Current
Year Minus 3
2020 4,025,025
Ballet Arizona Comparing Current
Year Minus 4
2020 500,000
Fort Wayne Ballet Comparing Current
Year Minus 1
2018 26,128
Fort Wayne Ballet Comparing Current
Year Minus 1
2019 13,343
Fort Wayne Ballet Comparing Current
Year Minus 2
2019 148,799
Fort Wayne Ballet Comparing Current
Year Minus 2
2020 13,343
Fort Wayne Ballet Comparing Current
Year Minus 3
2020 148,799
Pacific Northwest Ballet Comparing Current
Year Minus 1
2019 3,000
Pacific Northwest Ballet Comparing Current
Year Minus 2
2020 3,000
San Francisco Ballet Comparing Current
Year Minus 1
2017 107,033,401
San Francisco Ballet Comparing Current
Year Minus 2
2017 105,867,772
San Francisco Ballet Comparing Current
Year Minus 2
2018 107,033,401
San Francisco Ballet Comparing Current
Year Minus 3
2018 105,867,772
San Francisco Ballet Comparing Current
Year Minus 3
2019 107,033,401
San Francisco Ballet Comparing Current
Year Minus 4
2019 105,867,772
San Francisco Ballet Comparing Current
Year Minus 4
2020 107,033,401
The Alabama Ballet Comparing Current
Year Minus 1
2019 227,040
The Alabama Ballet Comparing Current
Year Minus 2
2020 227,040
The Alabama Ballet Comparing Current
Year Minus 3
2020 219,787
The Alabama Ballet Comparing Current
Year Minus 4
2020 254,152

We see that values are repeated because if there is some value that is quite off, say for 2016, then this shows up in the CYM1 for 2017, but also CYM2 for 2018, CYM3 for 2019 and so on.

Tables of Reported Values for Each Company with Discordance in Reported Values

Observations:

  • We see in some cases, the problematic reports are clear initially. This is the case in San Francisco Ballet, Ballet Arizona, or the Alabama Ballet.
  • The differences for Fort Wayne Ballet and the Pacific Northwest Ballet are more subtle.
# iterate through EINs where there was discordance and
# generate a table so we can better see what's going on

variable_name <- "BeginningYearBalanceAmt"

walk(1:length(companies_different), ~{
  name <- companies_to_ein %>%
    filter(EIN == companies_different[.x]) %>%
    pull(organization_name)
  
  table <- crossref %>% 
    rename_with(cols=everything(), ~gsub(variable_name, "", .)) %>%
    filter(EIN %in% companies_different[.x]) %>%
    select(-c(EIN, contains("difference"))) %>%
    make_table(title = paste0("Reports for ",
                              name, "<br>EIN: ", 
                              companies_different[.x],
                               ", Variable: ", variable_name))
  
  print(table)
  
#  print(table)

})
Reports for Ballet Arizona
EIN: 860367773, Variable: BeginningYearBalanceAmt
fiscal_year CY CYM1 CYM2 CYM3 CYM4
2014 NA NA NA NA NA
2015 101399 100399 100399 100399 100399
2016 101399 101399 100399 100399 100399
2017 101399 101399 101399 100399 100399
2018 4746867 4126424 601399 NA NA
2019 4732822 4746867 4126424 601399 NA
2020 4670922 4732822 4746867 4126424 601399
2021 NA NA NA NA NA
Reports for Fort Wayne Ballet
EIN: 356006394, Variable: BeginningYearBalanceAmt
fiscal_year CY CYM1 CYM2 CYM3 CYM4
2014 NA NA NA NA NA
2015 NA NA NA NA NA
2016 NA NA NA NA NA
2017 1264981 1191922 1174150 36538 36538
2018 1413780 1291109 1219104 1201082 60137
2019 1415612 1427123 1413780 1291109 1219104
2020 1422619 1415612 1427123 1413780 1291109
2021 NA NA NA NA NA
Reports for Pacific Northwest Ballet
EIN: 910897129, Variable: BeginningYearBalanceAmt
fiscal_year CY CYM1 CYM2 CYM3 CYM4
2014 NA NA NA NA NA
2015 16919887 15702692 14297377 14671226 12528462
2016 17026097 16919887 15702692 14297377 14671226
2017 15778987 17026097 16919887 15702692 14297377
2018 18145008 15778987 17026097 16919887 15702692
2019 19254508 18148008 15778987 17026097 16919887
2020 19741644 19254508 18148008 15778987 17026097
2021 NA NA NA NA NA
Reports for San Francisco Ballet
EIN: 941415298, Variable: BeginningYearBalanceAmt
fiscal_year CY CYM1 CYM2 CYM3 CYM4
2014 NA NA NA NA NA
2015 174 174 174 1035814 2318646
2016 174 174 174 174 1035814
2017 100219009 107033575 105867946 92513161 79137681
2018 119047942 100219009 107033575 105867946 92513161
2019 125015507 119047942 100219009 107033575 105867946
2020 124452193 125015507 119047942 100219009 107033575
2021 NA NA NA NA NA
Reports for The Alabama Ballet
EIN: 630813626, Variable: BeginningYearBalanceAmt
fiscal_year CY CYM1 CYM2 CYM3 CYM4
2014 NA NA NA NA NA
2015 NA NA NA NA NA
2016 250000 250000 250000 250000 250000
2017 250000 250000 250000 250000 250000
2018 250000 250000 250000 250000 250000
2019 446591 477040 250000 250000 250000
2020 430113 446591 477040 469787 504152
2021 NA NA NA NA NA
crossref %>%
  pivot_longer(cols = contains("difference")) %>%
  select(EIN, fiscal_year, name, value) %>%
  # filter(value > 0) %>%
  left_join(companies_to_ein, by = c("EIN" = "EIN")) %>%
  mutate(year = substr(name, nchar(name), nchar(name)),
         year = paste0("Comparing Current Year Minus ",
                       year)) %>%
  arrange(organization_name) %>% View()

Cross Referencing All Endowment Variables

Missingness by Variable

variables_to_check  <- endowment_data %>%
  select(contains("CY")) %>%
  colnames() %>%
  gsub("CY|CYM.", "",.) %>%
  unique()

crossref_all <- map_df(
  variables_to_check,
  ~{  variable_name <- .x
  check_variable(variable_name,
                 data = endowment_data) %>% 
    # remove variable name part of column name 
    # so we can bind rows together, add this information
    # as a separate column
    rename_with(cols=everything(), 
                ~gsub(variable_name, "", .)) %>%
    mutate(variable = .x)
})


missing_all <- map_df( variables_to_check, 
 ~ {variable <- paste0("CY",.x)
    endowment_data %>%
      group_by(EIN) %>%
      summarize(number_observations = sum(!is.na(!!sym(variable)))) %>%
      group_by(number_observations) %>%
      summarize(number_eins=n()) %>%
      mutate(variable = variable)
})
colors <- c("#58b5e1", "#49406e", "#9dd84e", "#6633b4", "#46ebdc")


missing_all %>%
  mutate(number_observations = paste0(
    "Number of EINS with ",
    number_observations,
    " Observations for this Variable" )) %>%
      ggplot(aes(x = variable, y =number_eins, fill = variable))+
      geom_bar(stat="identity",
               position = "dodge",
               show.legend=FALSE) +
      geom_label(aes(label = number_eins,
                     color = variable),
                 fill = "white",
                 vjust = .5,
                 size = 2,
                 position = position_dodge(1),
                 label.padding = unit(.1, "lines"),
                 show.legend=FALSE) +
       facet_wrap(~number_observations, ncol=1) +
  coord_flip() +
      labs(y = "Number of Companies",
           x = "Variable Name",
           title = "Comparing Missingness Across Variables") +
      theme_bw() +
      theme(plot.title = element_text(face = "bold", hjust = .5),
            axis.title = element_text(face = "bold")) +
  scale_fill_manual(values = colors) +
  scale_color_manual(values = colors) +
  scale_y_continuous(n.breaks = 8) 

Fraction Discordant by Variable

# plot fraction discordant for each variable
crossref_all %>%
  select(EIN, contains("difference"), variable) %>%
  pivot_longer( contains("difference")) %>%
  filter(!is.na(value)) %>%
  group_by(variable) %>%
  summarize(
    number_of_discordant_observations = sum(value > 1),
    total_observations_of_variable = n(),
    fraction_discordant = number_of_discordant_observations / total_observations_of_variable) %>%
  ggplot(aes(x = fct_reorder(variable,
                             fraction_discordant,
                             .desc = TRUE),
             y = fraction_discordant)) +
  geom_bar(stat="identity",
           fill = "#234A77")+
  geom_label(aes(label = round(fraction_discordant, 3))) +
  theme_bw() +
  theme(plot.title = element_text(face = "bold", hjust = .5, size = 16),
            axis.title = element_text(face = "bold", size =16),
            axis.text.x = element_text(size = 12, angle = 10, vjust = .6)) +
  labs(y = "Fraction Discordant",
       x = "Endowment Variable",
       title = "Fraction of Observations that Were Discordant for Each Variable")

# generate table displaying the discordant values for a given variable
get_discordant_table <- function(variable_name, data) {
    
    # observations with nonzero difference 
    cross_ref_for_var <- data %>%
      filter(variable == variable_name) %>%
      pivot_longer(cols = contains("difference")) %>%
      select(EIN, fiscal_year, name, value) %>%
      filter(value > 0) 
    
    # EINs that have at least one discordance 
    discordant <- cross_ref_for_var %>%
      pull(EIN) %>% unique()
    
  # generate table displaying discordances
  cross_ref_for_var %>%
    left_join(companies_to_ein, by = c("EIN" = "EIN")) %>%
    mutate(year = substr(name, nchar(name), nchar(name)),
           year = paste0("Comparing Current<br> Year Minus ",
                         year)) %>%
    arrange(organization_name) %>%
    select(`Organization Name` = organization_name,
           `Difference in Years` = year, 
           `Fiscal Year` = fiscal_year,
           `Recent  - Previously Reported` = value) %>%
    make_table(title = paste0("Variable: ", 
                              variable_name,
      "<br>Comparing Values Reported in More Recent Report to Those Previously Reported:<br>",
      "<i>Number of Companies that have at Least One Report Not Concordant: </i>",
      length(discordant)),
               digits = 3, 
               format.args = list(
                 big.mark = ",",
                 scientific = FALSE),
      escape=FALSE,
      booktabs=TRUE)  %>%
    scroll_box(height = "450px",
               width = "100%") 

}

# iterate over all variables to check and generate table
walk(variables_to_check, ~{
  table_for_var <- get_discordant_table(.x, data = crossref_all)
  print(table_for_var)
})
Variable: BeginningYearBalanceAmt
Comparing Values Reported in More Recent Report to Those Previously Reported:
Number of Companies that have at Least One Report Not Concordant: 5
Organization Name Difference in Years Fiscal Year Recent - Previously Reported
Ballet Arizona Comparing Current
Year Minus 1
2018 4,025,025
Ballet Arizona Comparing Current
Year Minus 2
2018 500,000
Ballet Arizona Comparing Current
Year Minus 2
2019 4,025,025
Ballet Arizona Comparing Current
Year Minus 3
2019 500,000
Ballet Arizona Comparing Current
Year Minus 3
2020 4,025,025
Ballet Arizona Comparing Current
Year Minus 4
2020 500,000
Fort Wayne Ballet Comparing Current
Year Minus 1
2018 26,128
Fort Wayne Ballet Comparing Current
Year Minus 1
2019 13,343
Fort Wayne Ballet Comparing Current
Year Minus 2
2019 148,799
Fort Wayne Ballet Comparing Current
Year Minus 2
2020 13,343
Fort Wayne Ballet Comparing Current
Year Minus 3
2020 148,799
Pacific Northwest Ballet Comparing Current
Year Minus 1
2019 3,000
Pacific Northwest Ballet Comparing Current
Year Minus 2
2020 3,000
San Francisco Ballet Comparing Current
Year Minus 1
2017 107,033,401
San Francisco Ballet Comparing Current
Year Minus 2
2017 105,867,772
San Francisco Ballet Comparing Current
Year Minus 2
2018 107,033,401
San Francisco Ballet Comparing Current
Year Minus 3
2018 105,867,772
San Francisco Ballet Comparing Current
Year Minus 3
2019 107,033,401
San Francisco Ballet Comparing Current
Year Minus 4
2019 105,867,772
San Francisco Ballet Comparing Current
Year Minus 4
2020 107,033,401
The Alabama Ballet Comparing Current
Year Minus 1
2019 227,040
The Alabama Ballet Comparing Current
Year Minus 2
2020 227,040
The Alabama Ballet Comparing Current
Year Minus 3
2020 219,787
The Alabama Ballet Comparing Current
Year Minus 4
2020 254,152
Variable: ContributionsAmt
Comparing Values Reported in More Recent Report to Those Previously Reported:
Number of Companies that have at Least One Report Not Concordant: 2
Organization Name Difference in Years Fiscal Year Recent - Previously Reported
Joffrey Ballet Comparing Current
Year Minus 1
2017 41,702
Joffrey Ballet Comparing Current
Year Minus 2
2018 41,702
Joffrey Ballet Comparing Current
Year Minus 3
2019 41,702
Joffrey Ballet Comparing Current
Year Minus 4
2020 41,702
San Francisco Ballet Comparing Current
Year Minus 1
2017 3,440,416
San Francisco Ballet Comparing Current
Year Minus 2
2017 6,457,496
San Francisco Ballet Comparing Current
Year Minus 2
2018 3,440,416
San Francisco Ballet Comparing Current
Year Minus 3
2018 6,457,496
San Francisco Ballet Comparing Current
Year Minus 1
2019 1,025,273
San Francisco Ballet Comparing Current
Year Minus 2
2019 2,479,840
San Francisco Ballet Comparing Current
Year Minus 3
2019 3,440,416
San Francisco Ballet Comparing Current
Year Minus 4
2019 6,457,496
San Francisco Ballet Comparing Current
Year Minus 2
2020 1,025,273
San Francisco Ballet Comparing Current
Year Minus 3
2020 2,479,840
San Francisco Ballet Comparing Current
Year Minus 4
2020 3,440,416
Variable: InvestmentEarningsOrLossesAmt
Comparing Values Reported in More Recent Report to Those Previously Reported:
Number of Companies that have at Least One Report Not Concordant: 4
Organization Name Difference in Years Fiscal Year Recent - Previously Reported
Fort Wayne Ballet Comparing Current
Year Minus 1
2018 2,568
Pittsburgh Ballet Theatre Comparing Current
Year Minus 1
2019 80,765
Pittsburgh Ballet Theatre Comparing Current
Year Minus 2
2020 80,765
Pittsburgh Ballet Theatre Comparing Current
Year Minus 1
2021 25,082
Pittsburgh Ballet Theatre Comparing Current
Year Minus 3
2021 80,765
San Francisco Ballet Comparing Current
Year Minus 2
2017 1,133,639
San Francisco Ballet Comparing Current
Year Minus 3
2018 1,133,639
San Francisco Ballet Comparing Current
Year Minus 4
2019 1,133,639
The Alabama Ballet Comparing Current
Year Minus 3
2020 30,742
Variable: OtherExpendituresAmt
Comparing Values Reported in More Recent Report to Those Previously Reported:
Number of Companies that have at Least One Report Not Concordant: 4
Organization Name Difference in Years Fiscal Year Recent - Previously Reported
Fort Wayne Ballet Comparing Current
Year Minus 1
2019 3,149
Fort Wayne Ballet Comparing Current
Year Minus 2
2020 3,149
Pittsburgh Ballet Theatre Comparing Current
Year Minus 1
2020 300
Pittsburgh Ballet Theatre Comparing Current
Year Minus 2
2021 300
San Francisco Ballet Comparing Current
Year Minus 1
2017 5,795,836
San Francisco Ballet Comparing Current
Year Minus 2
2017 5,498,836
San Francisco Ballet Comparing Current
Year Minus 2
2018 5,795,836
San Francisco Ballet Comparing Current
Year Minus 3
2018 5,498,836
San Francisco Ballet Comparing Current
Year Minus 3
2019 5,795,836
San Francisco Ballet Comparing Current
Year Minus 4
2019 5,498,836
San Francisco Ballet Comparing Current
Year Minus 4
2020 5,795,836
The Alabama Ballet Comparing Current
Year Minus 3
2020 23,489
The Alabama Ballet Comparing Current
Year Minus 4
2020 25,201
Variable: EndYearBalanceAmt
Comparing Values Reported in More Recent Report to Those Previously Reported:
Number of Companies that have at Least One Report Not Concordant: 8
Organization Name Difference in Years Fiscal Year Recent - Previously Reported
Ballet Arizona Comparing Current
Year Minus 1
2018 973,030
Ballet Arizona Comparing Current
Year Minus 2
2018 4,025,025
Ballet Arizona Comparing Current
Year Minus 3
2018 500,000
Ballet Arizona Comparing Current
Year Minus 2
2019 973,030
Ballet Arizona Comparing Current
Year Minus 3
2019 4,025,025
Ballet Arizona Comparing Current
Year Minus 4
2019 500,000
Ballet Arizona Comparing Current
Year Minus 3
2020 973,030
Ballet Arizona Comparing Current
Year Minus 4
2020 4,025,025
Fort Wayne Ballet Comparing Current
Year Minus 1
2018 38,699
Fort Wayne Ballet Comparing Current
Year Minus 2
2019 52,042
Fort Wayne Ballet Comparing Current
Year Minus 3
2020 52,042
Joffrey Ballet Comparing Current
Year Minus 1
2017 41,702
Joffrey Ballet Comparing Current
Year Minus 2
2018 41,702
Joffrey Ballet Comparing Current
Year Minus 3
2019 41,702
Joffrey Ballet Comparing Current
Year Minus 4
2020 41,702
Pacific Northwest Ballet Comparing Current
Year Minus 1
2019 3,000
Pacific Northwest Ballet Comparing Current
Year Minus 2
2020 3,000
Pittsburgh Ballet Theatre Comparing Current
Year Minus 1
2019 80,765
Pittsburgh Ballet Theatre Comparing Current
Year Minus 2
2020 80,765
Pittsburgh Ballet Theatre Comparing Current
Year Minus 1
2021 25,082
Pittsburgh Ballet Theatre Comparing Current
Year Minus 3
2021 80,765
San Francisco Ballet Comparing Current
Year Minus 1
2017 100,218,835
San Francisco Ballet Comparing Current
Year Minus 2
2017 107,033,401
San Francisco Ballet Comparing Current
Year Minus 2
2018 100,218,835
San Francisco Ballet Comparing Current
Year Minus 3
2018 107,033,401
San Francisco Ballet Comparing Current
Year Minus 3
2019 100,218,835
San Francisco Ballet Comparing Current
Year Minus 4
2019 107,033,401
San Francisco Ballet Comparing Current
Year Minus 4
2020 100,218,835
Texas Ballet Theater Comparing Current
Year Minus 1
2016 143
Texas Ballet Theater Comparing Current
Year Minus 2
2017 143
Texas Ballet Theater Comparing Current
Year Minus 3
2018 143
Texas Ballet Theater Comparing Current
Year Minus 4
2019 143
The Alabama Ballet Comparing Current
Year Minus 1
2019 196,591
The Alabama Ballet Comparing Current
Year Minus 2
2020 196,591
The Alabama Ballet Comparing Current
Year Minus 3
2020 227,040
The Alabama Ballet Comparing Current
Year Minus 4
2020 219,787

Companies with Discordant Reporting for at Least One Variable

# variables corresponding to number of companies with at least one discordance
crossref_all %>%
      pivot_longer(cols = contains("difference")) %>%
      select(EIN, fiscal_year, name, value, variable) %>%
      filter(value > 0) %>%
      group_by(EIN) %>%
      summarize(
                number_variables = length(unique(variable)),
                variable = paste(unique(variable), collapse=",<br>")) %>%
  left_join(companies_to_ein) %>%
  arrange(organization_name) %>%
  select(`Organization Name` = `organization_name`,
         `Number of Variables Discordant` = number_variables,
         `Variables with Discordant Reporting` = variable) %>%
  make_table(
    title = "Companies with Discordant Reporting for at Least One Variable",
    escape=FALSE)
Companies with Discordant Reporting for at Least One Variable
Organization Name Number of Variables Discordant Variables with Discordant Reporting
Ballet Arizona 2 BeginningYearBalanceAmt,
EndYearBalanceAmt
Fort Wayne Ballet 4 BeginningYearBalanceAmt,
InvestmentEarningsOrLossesAmt,
OtherExpendituresAmt,
EndYearBalanceAmt
Joffrey Ballet 2 ContributionsAmt,
EndYearBalanceAmt
Pacific Northwest Ballet 2 BeginningYearBalanceAmt,
EndYearBalanceAmt
Pittsburgh Ballet Theatre 3 InvestmentEarningsOrLossesAmt,
OtherExpendituresAmt,
EndYearBalanceAmt
San Francisco Ballet 5 BeginningYearBalanceAmt,
ContributionsAmt,
InvestmentEarningsOrLossesAmt,
OtherExpendituresAmt,
EndYearBalanceAmt
Texas Ballet Theater 1 EndYearBalanceAmt
The Alabama Ballet 4 BeginningYearBalanceAmt,
InvestmentEarningsOrLossesAmt,
OtherExpendituresAmt,
EndYearBalanceAmt
# for each variable, list of EINs that have at least one discordance 
intersections <- crossref_all %>%
      pivot_longer(cols = contains("difference")) %>%
      select(EIN, fiscal_year, name, value, variable) %>%
      filter(value > 0) %>%
      group_by(variable) %>%
      summarize(EINs = list(unique(EIN)))

discord_in_all <- Reduce(intersect, intersections$EINs) %>% unique() %>% length() 
discord_at_least_one <- Reduce(union, intersections$EINs) %>% unique() %>% length() 

The number of companies with a discordant report for all variables was 1, and the number of companies with at least one discordant report for all variables was 8.

# visualize discordances in given variable_name 
plot_reported_for_variable <- function(variable_name, crossref, endowment) {
  
  cross_ref_for_var <- crossref %>%
      filter(variable == variable_name) %>%
      pivot_longer(cols = contains("difference")) %>%
      select(EIN, fiscal_year, name, value) %>%
      filter(value > 0) 
    
  discordant <- cross_ref_for_var %>%
      pull(EIN) %>% unique()
  
  number_cols <- ifelse(length(discordant) <= 6, 1,2)
  
  
  # plot the values for the year they correspond to so we can compare,
  # for example, if CM1 for 2016 is the same as CY for 2015
  endowment %>%
    filter(EIN %in% discordant) %>%
    select(EIN, fiscal_year, contains(variable_name)) %>%
    group_by(EIN) %>%
    arrange(fiscal_year) %>% 
    pivot_longer(3: ncol(.)) %>%
    mutate(source = ifelse(grepl("CYM", name), substr(name, 1,4), "CY"),
           year_lag = ifelse(grepl("CYM", name), substr(source, 4,4), 0),
           year_lag = as.numeric(year_lag),
           fiscal_year = as.integer(paste0(fiscal_year))) %>%
    mutate(value_year = fiscal_year -year_lag
           ) %>%
    left_join(companies_to_ein) %>%
    mutate(organization_name = paste0(organization_name, 
                                      " (EIN: ", EIN, ")")) %>%
    ggplot(aes(x = value_year, y = value)) +
    geom_jitter(aes(fill=source), height  =0, 
                width = .2,
                alpha = .8,
                size = 2.2,
                shape =21,
                color = "black",
                stroke =.4) +
   # geom_line(aes(group = source, color = source)) +
    facet_wrap(~organization_name, scales= 'free_y', ncol = number_cols) +
    scale_x_continuous(breaks = 2011:2021 ) +
    scale_y_continuous(labels = comma) +
    viridis::scale_fill_viridis(option="magma", discrete=TRUE) +
    theme_bw() +
    labs(x = "Fiscal Year",
         y = "Reported Value (Dollars)",
         title = paste0("Comparing Reported Values for ", variable_name),
         subtitle = "Only Considering Companies with at Least One Discordant Value") +
    theme(plot.title = element_text(
      face = "bold",
      hjust = .5, 
      size = 16),
      axis.title = element_text(face = "bold", size =16),
      axis.text = element_text(size = 12),
      strip.text = element_text(face = "bold", size = 14),
      plot.subtitle=element_text(size =14, 
                                 face="italic",
                                 hjust = .5),
      legend.text = element_text(size = 10),
      legend.title = element_text(face = "bold", size = 12)) +
    guides(legend = guide_legend(override.aes = list(size = 3)))

}

# plot variables by year, by variable only for EINs that have
# at least one discordance for a given variable
walk(unique(variables_to_check),
     ~ {plt <- plot_reported_for_variable(
       variable_name = .x,
       crossref = crossref_all,
       endowment = endowment_data)
     print(plt) })

Questions to Consider

  • Should we assume the most recently reported values are (the most) accurate?
crossref %>% 
  select(EIN, contains("difference")) %>%
  pivot_longer(cols = contains("difference")) %>%
  group_by(name) %>%
  mutate(count_na = sum(is.na(value)),
            count_not_na = sum(!is.na(value))) %>%
  ungroup() %>%
  ggplot(aes(x = value)) +
 # geom_boxplot() +
  geom_histogram(bins = 50)

crossref %>% 
  select(EIN, contains("difference")) %>%
  pivot_longer(cols = contains("difference")) %>%
  group_by(name) %>%
  mutate(count_na = sum(is.na(value)),
            count_not_na = sum(!is.na(value))) %>%
  ungroup() %>%
  ggplot(aes(x = name, y=value)) +
  geom_boxplot() +
  geom_jitter(alpha = .5, height = 0, width = .01)

crossref %>% 
  select(EIN, contains("difference")) %>%
  pivot_longer(cols = contains("difference")) %>%
  group_by(name) %>%
  mutate(count_na = sum(is.na(value)),
            count_not_na = sum(!is.na(value))) %>%
  ungroup() %>%
  ggplot(aes(x = name, y = value)) +
 # geom_boxplot() +
  geom_density()



  geom_histogram() +
  facet_wrap(~name) +
  scale_x_log10()